Del texto guía llevar a cabo los ejercicios. 4.7.2 del 10 al 13; 8.4 del 7 al 12 y 9.7.2 del 4 al 8.
Se deben presentar en un documento de R Markdown con un estilo CSS personalizado.
Elabore un ensayo de una página argumentando cómo desde el aprendizaje estadístico se puede contribuir a solucionar el problema de la calidad del aire en Medellín. Librerías utilizadas para la elaboración del trabajo:
library(ISLR)
library(MASS)
library(class)
library(randomForest)
library(tree)
library(gbm)
library(glmnet)
library(DT)
library(knitr)
library(tidyverse)
library(ggthemes)
library(e1071)
library(caret)
library(rpart)
library(rpart.plot)
library(kableExtra)
This question should be answered using the Weekly data set, which is part of the ISLR package. This data is similar in nature to the Smarket data from this chapter’s lab, except that it contains 1, 089 weekly returns for 21 years, from the beginning of 1990 to the end of 2010.
# Numerical summary of every column of the Weekly data set (ISLR).
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202
## Median : 0.2380 Median : 0.2340 Median :1.00268
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821
## Today Direction
## Min. :-18.1950 Down:484
## 1st Qu.: -1.1540 Up :605
## Median : 0.2410
## Mean : 0.1499
## 3rd Qu.: 1.4050
## Max. : 12.0260
# Scatterplot matrix of all pairs of variables.
pairs(Weekly)
# Correlation matrix; column 9 (Direction) is a factor, so it is excluded.
cor(Weekly[, -9])
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1 -0.03228927 1.000000000 -0.07485305 0.05863568 -0.071273876
## Lag2 -0.03339001 -0.074853051 1.00000000 -0.07572091 0.058381535
## Lag3 -0.03000649 0.058635682 -0.07572091 1.00000000 -0.075395865
## Lag4 -0.03112792 -0.071273876 0.05838153 -0.07539587 1.000000000
## Lag5 -0.03051910 -0.008183096 -0.07249948 0.06065717 -0.075675027
## Volume 0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today -0.03245989 -0.075031842 0.05916672 -0.07124364 -0.007825873
## Lag5 Volume Today
## Year -0.030519101 0.84194162 -0.032459894
## Lag1 -0.008183096 -0.06495131 -0.075031842
## Lag2 -0.072499482 -0.08551314 0.059166717
## Lag3 0.060657175 -0.06928771 -0.071243639
## Lag4 -0.075675027 -0.06107462 -0.007825873
## Lag5 1.000000000 -0.05851741 0.011012698
## Volume -0.058517414 1.00000000 -0.033077783
## Today 0.011012698 -0.03307778 1.000000000
Las variables año y volumen parecen tener una relación; en el pairs ambas se ven crecientes, aunque una cóncava hacia arriba y la otra cóncava hacia abajo.
# Make Weekly's columns directly accessible.
# NOTE(review): attach() is generally discouraged (masking risk); kept to
# preserve the original workflow and its recorded output.
attach(Weekly)
# Logistic regression of Direction on the five lagged returns plus Volume,
# fit on the full data set.
glm.fit = glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = Weekly, family = binomial)
summary(glm.fit)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
El Lag2 (retraso 2) tiene significancia estadística, con Pr(>|z|) = 3%
# In-sample fitted probabilities; classify "Up" when P(Up) > 0.5.
glm.probs = predict(glm.fit, type = "response")
glm.pred = rep("Down", length(glm.probs))
glm.pred[glm.probs > 0.5] = "Up"
# Confusion matrix over the full data set.
table(glm.pred, Direction)
## Direction
## glm.pred Down Up
## Down 54 48
## Up 430 557
Renombrando los resultados de la siguiente manera: a = 54 b = 48 c = 430 d = 557 Para hallar el porcentaje de predicción correcto se realiza la siguiente operación = (a+d)/(a+b+c+d) Para hallar el porcentaje de regresion logistica correcta se calcula = d/(d+b) Para hallar el porcentaje de regresión logística incorrecta se calcula = a/(c+a) De esta manera tenemos que el porcentaje de predicciones correctas cuenta con un valor de 56.1%. La regresión logística es correcta la mayoría del tiempo con un valor del 92.1% La regresión logística es incorrecta la mayoría del tiempo con un valor del 11.2%
# Hold out 2009-2010 as a test period and refit the model using only Lag2
# (the one significant predictor above).
train = (Year < 2009)
Weekly.0910 = Weekly[!train, ]
glm.fit = glm(Direction ~ Lag2, data = Weekly, family = binomial, subset = train)
glm.probs = predict(glm.fit, Weekly.0910, type = "response")
glm.pred = rep("Down", length(glm.probs))
glm.pred[glm.probs > 0.5] = "Up"
Direction.0910 = Direction[!train]
# Confusion matrix and overall accuracy on the 2009-2010 test weeks.
table(glm.pred, Direction.0910)
## Direction.0910
## glm.pred Down Up
## Down 9 5
## Up 34 56
mean(glm.pred == Direction.0910)
## [1] 0.625
# LDA with Lag2 on the training years, evaluated on 2009-2010.
lda.fit = lda(Direction ~ Lag2, data = Weekly, subset = train)
lda.pred = predict(lda.fit, Weekly.0910)
table(lda.pred$class, Direction.0910)
## Direction.0910
## Down Up
## Down 9 5
## Up 34 56
mean(lda.pred$class == Direction.0910)
## [1] 0.625
# QDA with Lag2; note it predicts "Up" for every test week.
qda.fit = qda(Direction ~ Lag2, data = Weekly, subset = train)
qda.class = predict(qda.fit, Weekly.0910)$class
table(qda.class, Direction.0910)
## Direction.0910
## qda.class Down Up
## Down 0 0
## Up 43 61
mean(qda.class == Direction.0910)
## [1] 0.5865385
# KNN with k = 1 on Lag2 alone; the seed fixes random tie-breaking.
train.X = as.matrix(Lag2[train])
test.X = as.matrix(Lag2[!train])
train.Direction = Direction[train]
set.seed(1)
knn.pred = knn(train.X, test.X, train.Direction, k = 1)
table(knn.pred, Direction.0910)
## Direction.0910
## knn.pred Down Up
## Down 21 30
## Up 22 31
mean(knn.pred == Direction.0910)
## [1] 0.5
QDA con la raiz cuadrada del valor absoluto de Lag2
# QDA adding the transformed predictor sqrt(|Lag2|).
qda.fit = qda(Direction ~ Lag2 + sqrt(abs(Lag2)), data = Weekly, subset = train)
qda.class = predict(qda.fit, Weekly.0910)$class
table(qda.class, Direction.0910)
## Direction.0910
## qda.class Down Up
## Down 12 13
## Up 31 48
mean(qda.class == Direction.0910)
## [1] 0.5769231
LDA interacción Lag2 con Lag1
# LDA using only the Lag2:Lag1 interaction term.
lda.fit = lda(Direction ~ Lag2:Lag1, data = Weekly, subset = train)
lda.pred = predict(lda.fit, Weekly.0910)
mean(lda.pred$class == Direction.0910)
## [1] 0.5769231
Se realizará la regresión con Lag2:Lag1
# Logistic regression using only the Lag2:Lag1 interaction term.
glm.fit = glm(Direction ~ Lag2:Lag1, data = Weekly, family = binomial, subset = train)
glm.probs = predict(glm.fit, Weekly.0910, type = "response")
glm.pred = rep("Down", length(glm.probs))
glm.pred[glm.probs > 0.5] = "Up"
Direction.0910 = Direction[!train]
table(glm.pred, Direction.0910)
## Direction.0910
## glm.pred Down Up
## Down 1 1
## Up 42 60
mean(glm.pred == Direction.0910)
## [1] 0.5865385
KNN con K = 10
# KNN with k = 10 (same Lag2-only training/test matrices as before).
knn.pred = knn(train.X, test.X, train.Direction, k = 10)
table(knn.pred, Direction.0910)
## Direction.0910
## knn.pred Down Up
## Down 17 18
## Up 26 43
mean(knn.pred == Direction.0910)
## [1] 0.5769231
KNN con K = 100
# KNN with k = 100
knn.pred = knn(train.X, test.X, train.Direction, k = 100)
table(knn.pred, Direction.0910)
## Direction.0910
## knn.pred Down Up
## Down 9 12
## Up 34 49
mean(knn.pred == Direction.0910)
## [1] 0.5576923
De estas pruebas realizadas, el LDA original y la regresión logística tienen un mejor rendimiento si se habla de la tasa de error de prueba de estas iteraciones. ——————————————————————————————————————————
In this problem, you will develop a model to predict whether a given car gets high or low gas mileage based on the Auto data set.
# Numerical summary of the Auto data set (ISLR).
summary(Auto)
## mpg cylinders displacement horsepower
## Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0
## 1st Qu.:17.00 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 75.0
## Median :22.75 Median :4.000 Median :151.0 Median : 93.5
## Mean :23.45 Mean :5.472 Mean :194.4 Mean :104.5
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:275.8 3rd Qu.:126.0
## Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0
##
## weight acceleration year origin
## Min. :1613 Min. : 8.00 Min. :70.00 Min. :1.000
## 1st Qu.:2225 1st Qu.:13.78 1st Qu.:73.00 1st Qu.:1.000
## Median :2804 Median :15.50 Median :76.00 Median :1.000
## Mean :2978 Mean :15.54 Mean :75.98 Mean :1.577
## 3rd Qu.:3615 3rd Qu.:17.02 3rd Qu.:79.00 3rd Qu.:2.000
## Max. :5140 Max. :24.80 Max. :82.00 Max. :3.000
##
## name
## amc matador : 5
## ford pinto : 5
## toyota corolla : 5
## amc gremlin : 4
## amc hornet : 4
## chevrolet chevette: 4
## (Other) :365
attach(Auto)
## The following object is masked from package:ggplot2:
##
## mpg
# Binary response: mpg01 = 1 when mpg is above its median, else 0.
mpg01 = rep(0, length(mpg))
mpg01[mpg > median(mpg)] = 1
Auto = data.frame(Auto, mpg01)
# Correlations with the new response; column 9 (name, a factor) is dropped.
cor(Auto[, -9])
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## mpg01 0.8369392 -0.7591939 -0.7534766 -0.6670526 -0.7577566
## acceleration year origin mpg01
## mpg 0.4233285 0.5805410 0.5652088 0.8369392
## cylinders -0.5046834 -0.3456474 -0.5689316 -0.7591939
## displacement -0.5438005 -0.3698552 -0.6145351 -0.7534766
## horsepower -0.6891955 -0.4163615 -0.4551715 -0.6670526
## weight -0.4168392 -0.3091199 -0.5850054 -0.7577566
## acceleration 1.0000000 0.2903161 0.2127458 0.3468215
## year 0.2903161 1.0000000 0.1815277 0.4299042
## origin 0.2127458 0.1815277 1.0000000 0.5136984
## mpg01 0.3468215 0.4299042 0.5136984 1.0000000
pairs(Auto)
Se puede observar una correlación fuerte de mpg con cilindros, peso, desplazamiento y caballos de fuerza.
Se tiene en cuenta si el año es par.
# Train on even model years, test on odd model years.
train = (year%%2 == 0)
test = !train
Auto.train = Auto[train, ]
Auto.test = Auto[test, ]
mpg01.test = mpg01[test]
Con LDA
# LDA on the four predictors most correlated with mpg01; report test error.
lda.fit = lda(mpg01 ~ cylinders + weight + displacement + horsepower, data = Auto,
subset = train)
lda.pred = predict(lda.fit, Auto.test)
mean(lda.pred$class != mpg01.test)
## [1] 0.1263736
La tasa de error es de 12.6% aproximadamente.
Con QDA
# QDA with the same four predictors; report test error.
qda.fit = qda(mpg01 ~ cylinders + weight + displacement + horsepower, data = Auto,
subset = train)
qda.pred = predict(qda.fit, Auto.test)
mean(qda.pred$class != mpg01.test)
## [1] 0.1318681
La tasa de error es de 13.2% aproximadamente.
Con regresion logística.
# Logistic regression with the same predictors, threshold at 0.5.
glm.fit = glm(mpg01 ~ cylinders + weight + displacement + horsepower, data = Auto,
family = binomial, subset = train)
glm.probs = predict(glm.fit, Auto.test, type = "response")
glm.pred = rep(0, length(glm.probs))
glm.pred[glm.probs > 0.5] = 1
mean(glm.pred != mpg01.test)
## [1] 0.1208791
La tasa de error es de 12.1% aproximadamente.
# KNN design matrices built from the same four predictors; seed fixed for
# reproducible tie-breaking.
train.X = cbind(cylinders, weight, displacement, horsepower)[train, ]
test.X = cbind(cylinders, weight, displacement, horsepower)[test, ]
train.mpg01 = mpg01[train]
set.seed(1)
KNN con K = 1
knn.pred = knn(train.X, test.X, train.mpg01, k = 1)
mean(knn.pred != mpg01.test)
## [1] 0.1538462
La tasa de error es de 15.4% aproximadamente.
KNN con k = 10
knn.pred = knn(train.X, test.X, train.mpg01, k = 10)
mean(knn.pred != mpg01.test)
## [1] 0.1648352
La tasa de error es de 16.5% aproximadamente.
KNN con K = 100
knn.pred = knn(train.X, test.X, train.mpg01, k = 100)
mean(knn.pred != mpg01.test)
## [1] 0.1428571
La tasa de error es de 14.3% aproximadamente. Esta parece tener mejor rendimiento que K = 10 y K = 1
This problem involves writing functions.
# Power(): returns 2 raised to the 3rd power (exercise 7.a).
Power <- function() {
  2^3
}
print(Power())
## [1] 8
# Power2(x, a): returns x raised to the power a.
Power2 <- function(x, a) {
  x^a
}
Power2(3, 8)
## [1] 6561
Power2(10, 3)
## [1] 1000
Power2(8, 17)
## [1] 2.2518e+15
Power2(131, 3)
## [1] 2248091
# Power3(x, a): returns x^a via an explicit return(), as the exercise asks.
Power3 <- function(x, a) {
  out <- x^a
  return(out)
}
# Plot x^2 against x on log-log axes for x = 1..10.
x <- 1:10
plot(x, Power3(x, 2), log = "xy", ylab = "Log of y = x^2", xlab = "Log of x",
  main = "Log of x^2 versus Log of x")
# PlotPower(x, a): scatterplot of x^a against x, delegating to Power3().
PlotPower <- function(x, a) {
  plot(x, Power3(x, a))
}
PlotPower(1:10, 3)
Using the Boston data set, fit classification models in order to predict whether a given suburb has a crime rate above or below the median. Explore logistic regression, LDA, and KNN models using various subsets of the predictors. Describe your findings.
# Numerical summary of the Boston data set (MASS).
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08204 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
attach(Boston)
# Binary response: crime01 = 1 when crim is above its median, else 0.
# NOTE(review): appending crime01 makes Boston a 15-column frame, which
# shifts any later indexing by column number — verify index-based code below.
crime01 = rep(0, length(crim))
crime01[crim > median(crim)] = 1
Boston = data.frame(Boston, crime01)
# Non-random split: first half of the rows train, second half test.
# NOTE(review): Boston rows are ordered, so this split is not random —
# results may differ from a randomized split.
train = 1:(dim(Boston)[1]/2)
test = (dim(Boston)[1]/2 + 1):dim(Boston)[1]
Boston.train = Boston[train, ]
Boston.test = Boston[test, ]
crime01.test = crime01[test]
Regresión logística.
# Logistic regression on all predictors except the response and raw crim.
glm.fit = glm(crime01 ~ . - crime01 - crim, data = Boston, family = binomial,
subset = train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
glm.probs = predict(glm.fit, Boston.test, type = "response")
glm.pred = rep(0, length(glm.probs))
glm.pred[glm.probs > 0.5] = 1
mean(glm.pred != crime01.test)
## [1] 0.1818182
La tasa de error es de 18.2% aproximadamente
# Same model with chas and tax also removed.
glm.fit = glm(crime01 ~ . - crime01 - crim - chas - tax, data = Boston, family = binomial,
subset = train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
glm.probs = predict(glm.fit, Boston.test, type = "response")
glm.pred = rep(0, length(glm.probs))
glm.pred[glm.probs > 0.5] = 1
mean(glm.pred != crime01.test)
## [1] 0.1857708
La tasa de error es de 18.6% aproximadamente.
Con LDA
# LDA on all predictors except the response and raw crim.
lda.fit = lda(crime01 ~ . - crime01 - crim, data = Boston, subset = train)
lda.pred = predict(lda.fit, Boston.test)
mean(lda.pred$class != crime01.test)
## [1] 0.1343874
La tasa de error es 13.5% aproximadamente
# LDA with chas and tax also removed.
lda.fit = lda(crime01 ~ . - crime01 - crim - chas - tax, data = Boston, subset = train)
lda.pred = predict(lda.fit, Boston.test)
mean(lda.pred$class != crime01.test)
## [1] 0.1225296
La tasa de error es de 12.3% aproximadamente.
# LDA on a smaller subset of predictors.
lda.fit = lda(crime01 ~ . - crime01 - crim - chas - tax - lstat - indus - age,
data = Boston, subset = train)
lda.pred = predict(lda.fit, Boston.test)
mean(lda.pred$class != crime01.test)
## [1] 0.1185771
La tasa de error es de 11.9% aproximadamente.
Método KNN con K = 1
# KNN design matrices using all 13 original predictors; seed fixed for
# reproducible tie-breaking.
train.X = cbind(zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, black,
lstat, medv)[train, ]
test.X = cbind(zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, black,
lstat, medv)[test, ]
train.crime01 = crime01[train]
set.seed(1)
# KNN(k=1)
knn.pred = knn(train.X, test.X, train.crime01, k = 1)
mean(knn.pred != crime01.test)
## [1] 0.458498
La tasa de error es de 45.9% aproximadamente.
Con K = 10
knn.pred = knn(train.X, test.X, train.crime01, k = 10)
mean(knn.pred != crime01.test)
## [1] 0.1185771
La tasa de error es de 11.9% aproximadamente.
Con K = 100
knn.pred = knn(train.X, test.X, train.crime01, k = 100)
mean(knn.pred != crime01.test)
## [1] 0.4901186
La tasa de error es de 49.1% aproximadamente.
Método KNN con K = 10 y con subconjunto de variables.
# KNN with k = 10 on a reduced predictor set.
train.X = cbind(zn, nox, rm, dis, rad, ptratio, black, medv)[train, ]
test.X = cbind(zn, nox, rm, dis, rad, ptratio, black, medv)[test, ]
knn.pred = knn(train.X, test.X, train.crime01, k = 10)
mean(knn.pred != crime01.test)
## [1] 0.2727273
La tasa de error es de 27.3% aproximadamente.
Vemos que se presenta una menor tasa de error con K = 10.
Lo primero que haremos es dividir los datos de entrenamiento y de prueba
# Random 70/30 split of Boston for a random-forest mtry comparison.
# NOTE(review): column 14 is assumed to be medv; if crime01 was appended to
# Boston earlier in this document, Boston has 15 columns and the predictor
# set here still contains crime01 — confirm Boston's columns at this point.
set.seed(1)
subset<-sample(1:nrow(Boston),nrow(Boston)*0.7)
Boston.train<-Boston[subset,-14]
Boston.test<-Boston[-subset,-14]
y.train<-Boston[subset,14]
y.test<-Boston[-subset,14]
Construiremos 5 modelos diferentes con el parametro de ajuste m=p,p/2,p/3,p/4,p^0.5
# Fit 500-tree forests with mtry = p, p/2, sqrt(p), p/3 and p/4,
# tracking test MSE via xtest/ytest.
rfmodel1<-randomForest(Boston.train,y=y.train,xtest = Boston.test,ytest = y.test,ntree=500,mtry=ncol(Boston.train))
rfmodel2<-randomForest(Boston.train,y.train,xtest = Boston.test,ytest = y.test,ntree=500,mtry=(ncol(Boston.train))/2)
rfmodel3<-randomForest(Boston.train,y.train,xtest = Boston.test,ytest = y.test,ntree=500,mtry=(ncol(Boston.train))^(0.5))
rfmodel4<-randomForest(Boston.train,y.train,xtest = Boston.test,ytest = y.test,ntree=500,mtry=(ncol(Boston.train))/3)
rfmodel5<-randomForest(Boston.train,y.train,xtest = Boston.test,ytest = y.test,ntree=500,mtry=(ncol(Boston.train))/4)
A continuación graficaremos.
# Test MSE as a function of the number of trees, one curve per mtry choice.
plot(1:500,rfmodel1$test$mse,col="red",type="l",xlab = "Number of Trees",ylab = "Test MSE",ylim = c(10,25))
lines(1:500,rfmodel2$test$mse, col="orange",type="l")
lines(1:500,rfmodel3$test$mse, col="green",type="l")
lines(1:500,rfmodel4$test$mse, col="blue",type="l")
lines(1:500,rfmodel5$test$mse, col="black",type="l")
legend("topright",c("m=p=13","m=p/2","m=sqrt(p)","m=p/3","m=p/4"),col=c("red","orange","green","blue","black"),cex=0.5,lty=1)
Se puede observar que el error de prueba disminuye con el aumento de árboles. Luego de un cierto número de árboles se estabiliza.
El error mínimo se ve cuando se elige m = sqrt (p)
# Carseats: 70/30 split, then a single regression tree for Sales.
attach(Carseats)
set.seed(1)
subset<-sample(nrow(Carseats),nrow(Carseats)*0.7)
car.train<-Carseats[subset,]
car.test<-Carseats[-subset,]
car.model.train<-tree(Sales~.,car.train)
summary(car.model.train)
##
## Regression tree:
## tree(formula = Sales ~ ., data = car.train)
## Variables actually used in tree construction:
## [1] "ShelveLoc" "Price" "Age" "Income" "CompPrice"
## [6] "Advertising"
## Number of terminal nodes: 18
## Residual mean deviance: 2.409 = 631.1 / 262
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -4.77800 -0.96100 -0.08865 0.00000 1.01800 4.14100
# Draw the fitted tree and compute the test MSE.
plot(car.model.train)
text(car.model.train,pretty=0)
tree.prediction<-predict(car.model.train,newdata=car.test)
tree.mse<-mean((car.test$Sales-tree.prediction)^2)
tree.mse
## [1] 4.208383
El MSE es de 4.2 aproximadamente.
# Cross-validation to choose the tree size, then prune to 6 terminal nodes
# and recompute the test MSE.
set.seed(1)
cv.car<-cv.tree(car.model.train)
plot(cv.car$size,cv.car$dev,xlab = "Size of Tree",ylab = "Deviance",type = "b")
prune.car<-prune.tree(car.model.train,best=6)
plot(prune.car)
text(prune.car,pretty=0)
prune.predict<-predict(prune.car,car.test)
mean((prune.predict-car.test$Sales)^2)
## [1] 5.118217
Hemos utilizado la validación cruzada para encontrar el tamaño al que se debe podar el árbol.
Para el árbol podado obtenemos un MSE de 5.1 aproximadamente
# Bagging = random forest with m equal to the number of predictors.
# Carseats has 10 predictors (11 columns minus Sales), so mtry is derived
# from the data instead of the hard-coded 13 used originally, which
# triggered "invalid mtry: reset to within valid range" and was silently
# clamped by randomForest to the same value used here.
bag.car<-randomForest(Sales~.,car.train,importance=TRUE,mtry=ncol(car.train)-1)
# Variable importance from the bagged model, then its test MSE.
importance(bag.car)
## %IncMSE IncNodePurity
## CompPrice 34.6322504 233.60705
## Income 5.3645204 116.93827
## Advertising 18.8175105 153.05938
## Population -3.0858810 64.26621
## Price 70.9386948 698.15948
## ShelveLoc 74.2328945 645.49148
## Age 20.8561302 224.71954
## Education 1.3723565 61.47839
## Urban -1.9986734 10.51832
## US 0.8095402 10.19895
bag.car.predict<-predict(bag.car,car.test)
mean((bag.car.predict-car.test$Sales)^2)
## [1] 2.571169
El MSE obtenido es 2.6 aproximadamente. Se ha reducido aún más en comparación con un solo árbol podado. Por lo tanto, el ensacado ayudó a reducir el MSE
# Random forest with a reduced mtry.
# NOTE(review): Carseats has 10 predictors, so sqrt(p) would be sqrt(10);
# sqrt(13) ~ 3.61 is used here — confirm this was intentional.
rf.car<-randomForest(Sales~.,car.train,importance=TRUE,mtry=sqrt(13))
importance(rf.car)
## %IncMSE IncNodePurity
## CompPrice 20.7300392 213.78497
## Income 3.5122804 148.32279
## Advertising 15.6121947 159.16307
## Population 0.5759461 113.69354
## Price 51.9015680 594.84872
## ShelveLoc 51.4866473 539.23503
## Age 18.6833946 261.97525
## Education 3.0894573 88.05427
## Urban -2.4726183 17.29229
## US 4.0933782 27.11808
rf.car.predict<-predict(rf.car,car.test)
mean((rf.car.predict-car.test$Sales)^2)
## [1] 2.674922
Usando Random Forest, el MSE aumentó.
library(ISLR)
# OJ data: 800-row training sample, remainder as test; fit a
# classification tree for Purchase.
attach(OJ)
set.seed(1013)
train = sample(dim(OJ)[1], 800)
OJ.train = OJ[train, ]
OJ.test = OJ[-train, ]
library(tree)
oj.tree = tree(Purchase ~ ., data = OJ.train)
summary(oj.tree)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = OJ.train)
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff" "ListPriceDiff" "SalePriceMM"
## Number of terminal nodes: 7
## Residual mean deviance: 0.7564 = 599.8 / 793
## Misclassification error rate: 0.1612 = 129 / 800
El algoritmo eligió las variables “LoyalCH”, “PriceDiff”, “ListPriceDiff”, “SalePriceMM” en el modelo
Índice de error de clasificación del árbol completo: 16.1% aproximadamente para datos de entrenamiento
Número de nodos terminales: 7
# oj.tree   # printing the tree shows each node; terminal nodes end in "*"
El nodo terminal está representado por un asterisco.
# Draw the fitted tree.
plot(oj.tree)
text(oj.tree, pretty = 0)
El indicador más importante de compra parece ser “LoyalCH”
# Test-set confusion matrix (rows = true Purchase, columns = prediction).
oj.pred = predict(oj.tree, OJ.test, type = "class")
table(OJ.test$Purchase, oj.pred)
## oj.pred
## CH MM
## CH 149 15
## MM 30 76
# oj.testerror <- (15 + 30) / nrow(OJ.test)   # off-diagonal errors / test size
# round(oj.testerror, 3)
La tasa de error es del 16.7% aproximadamente ((15 + 30) / 270).
# Cross-validated deviance as a function of tree size.
cv.oj = cv.tree(oj.tree, FUN = prune.tree)
plot(cv.oj$size, cv.oj$dev, type = "b", xlab = "Tree Size", ylab = "Deviance")
Con base en la desviación validada cruzadamente se elige el tamaño del árbol; a continuación se poda el árbol a 6 nodos terminales.
# Prune the tree to 6 terminal nodes and inspect the training fit.
oj.pruned = prune.tree(oj.tree, best = 6)
summary(oj.pruned)
##
## Classification tree:
## snip.tree(tree = oj.tree, nodes = 12L)
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff" "ListPriceDiff"
## Number of terminal nodes: 6
## Residual mean deviance: 0.7701 = 611.5 / 794
## Misclassification error rate: 0.175 = 140 / 800
Tasa de error de clasificación errónea del árbol completo para datos de entrenamiento: 16.1%
Tasa de error de clasificación errónea del árbol podado para datos de entrenamiento: 17.5% aproximadamente
La poda no ayudó a reducir el error de clasificación errónea en los datos de entrenamiento.
# Test misclassification rate for the unpruned tree...
pred.unpruned = predict(oj.tree, OJ.test, type = "class")
misclass.unpruned = sum(OJ.test$Purchase != pred.unpruned)
misclass.unpruned/length(pred.unpruned)
## [1] 0.1666667
# ...and for the pruned tree.
pred.pruned = predict(oj.pruned, OJ.test, type = "class")
misclass.pruned = sum(OJ.test$Purchase != pred.pruned)
misclass.pruned/length(pred.pruned)
## [1] 0.2
Tasa de error de clasificación errónea del árbol completo para datos de prueba: 16.7%
Tasa de error de clasificación errónea del árbol podado para datos de prueba: 20%
Encontramos que después de la poda, la tasa de clasificación errónea en los datos de prueba es mayor (20% frente a 16.7%) en comparación con el árbol completo. En esta partición la poda no mejoró el error de prueba, aunque sí simplificó el árbol; el árbol completo generaliza mejor aquí.
# Hitters: drop rows with missing values and model log(Salary).
# NOTE(review): attach() happens before na.omit(), so the attached copy is
# the unfiltered one; the code below always uses the Hitters object itself.
attach(Hitters)
Hitters<-na.omit(Hitters)
Hitters$Salary<-log(Hitters$Salary)
# First 200 rows train, the rest test (per the exercise statement).
subset<-1:200
hitters.train<-Hitters[subset,]
hitters.test<-Hitters[-subset,]
set.seed(1)
Definimos el rango
# Shrinkage grid: lambda = 10^s for s in [-2, 0] in steps of 0.1.
powerss<-seq(-2,0,by=0.1)
lambdas<-10^powerss
Definimos la lista de almacenamiento de errores
train.error<-rep(NA,length(lambdas))
Se usa lambdas para almacenar los errores.
# Fit a 1000-tree boosting model per shrinkage value; record training MSE.
for (i in 1:length(lambdas)){
hitters.gbm<-gbm(Salary~.,hitters.train,distribution = "gaussian",n.trees=1000,shrinkage=lambdas[i])
# Predict on the training set and compute its MSE
hitters.train.pred<-predict(hitters.gbm,hitters.train,n.trees=1000)
train.error[i]<-mean((hitters.train.pred-hitters.train$Salary)^2)
}
Se realiza un plot del MSE de entrenamiento contra lambdas.
# Plotting training MSE against the shrinkage grid
plot(lambdas,train.error,type="b",xlab="Shrinkage Value(lambda)",ylab="Training MSE")
Se testea el MSE
# Reset the seed and preallocate storage for the test MSE per lambda.
set.seed(1)
Se usa lambdas para almacenar las pruebas de los errores.
test.error<-rep(NA,length(lambdas))
Los errores de prueba se almacenan en test.error, una posición por cada valor de lambda.
# Refit the boosting model per shrinkage value and record the test MSE.
for (i in 1:length(lambdas)){
hitters.gbm<-gbm(Salary~.,hitters.train,distribution = "gaussian",n.trees=1000,shrinkage=lambdas[i])
# Predict on the held-out test set and compute its MSE
hitters.test.pred<-predict(hitters.gbm,hitters.test,n.trees=1000)
test.error[i]<-mean((hitters.test.pred-hitters.test$Salary)^2)
}
Se grafica
# Test MSE versus shrinkage, and the minimum test MSE attained.
plot(lambdas,test.error,type="b",xlab="Shrinkage Value(lambda)",ylab="Test MSE")
hitters.gbm.testerror<-min(test.error)
hitters.gbm.testerror
## [1] 0.255455
El MSE de prueba mínimo obtenido es aproximadamente 0.26.
Ajuste del modelo de regresión de mínimos cuadrados.
# Least-squares baseline on the training set.
# BUG FIX: the original stored the fit in a variable named `lm`,
# masking stats::lm(); use a distinct name instead.
lm.fit <- lm(Salary ~ ., hitters.train)
hitters.predict.lm <- predict(lm.fit, hitters.test)
hitters.lm.test.mse <- mean((hitters.predict.lm - hitters.test$Salary)^2)
hitters.lm.test.mse
## [1] 0.4917959
Modelo de regresión de cresta. Se selecciona un valor s = 0.01 de lambda para que se ajuste al modelo.
# Ridge regression (alpha = 0), evaluated at lambda s = 0.01.
x.train.mat <- model.matrix(Salary ~ ., hitters.train)
x.test.mat <- model.matrix(Salary ~ ., hitters.test)
y.train <- hitters.train$Salary
hitters.ridge <- glmnet(x.train.mat, y.train, alpha = 0)
hitters.ridge.predict <- predict(hitters.ridge, s = 0.01, x.test.mat)
hitters.ridge.test.mse <- mean((hitters.ridge.predict - hitters.test$Salary)^2)
hitters.ridge.test.mse
## [1] 0.4570283
Modelo de regresión lasso. Se selecciona un valor s = 0.01 de lambda para ajustar el modelo.
# Lasso regression (alpha = 1), evaluated at lambda s = 0.01.
x.train.mat <- model.matrix(Salary ~ ., hitters.train)
x.test.mat <- model.matrix(Salary ~ ., hitters.test)
y.train <- hitters.train$Salary
hitters.lasso <- glmnet(x.train.mat, y.train, alpha = 1)
hitters.lasso.predict <- predict(hitters.lasso, s = 0.01, x.test.mat)
hitters.lasso.test.mse <- mean((hitters.lasso.predict - hitters.test$Salary)^2)
hitters.lasso.test.mse
## [1] 0.4700537
Tenemos el Test MSE de los diferentes métodos como se resume a continuación. Se puede ver que Boosting ofrece el menor MSE de prueba entre los modelos comparados:
Modelo de regresión de mínimos cuadrados, MSE de prueba: 0.49
Modelo de regresión ridge (cresta), MSE de prueba: 0.46
Modelo de regresión lasso, MSE de prueba: 0.47
Modelo de Boosting, MSE de prueba: 0.26
boost.hitters<-gbm(Salary~.,data=hitters.train,distribution = "gaussian",n.trees = 1000,shrinkage=lambdas[which.min(test.error)])
summary(boost.hitters)
## var rel.inf
## CAtBat CAtBat 30.0391455
## Years Years 7.1722320
## CWalks CWalks 6.6962390
## PutOuts PutOuts 6.1919523
## Walks Walks 6.0430398
## CRuns CRuns 5.8184446
## CHmRun CHmRun 5.7355580
## CRBI CRBI 5.6678930
## Hits Hits 5.5180489
## HmRun HmRun 3.7274075
## Assists Assists 3.6165621
## AtBat AtBat 2.8770937
## RBI RBI 2.8062318
## CHits CHits 2.8030774
## Errors Errors 2.3509666
## Runs Runs 1.5093746
## Division Division 0.6614964
## NewLeague NewLeague 0.5632541
## League League 0.2019828
CAtBat es la variable más importante.
# Bagging = random forest with mtry equal to all 19 predictors.
set.seed(1)
hitters.bagging <- randomForest(Salary ~ ., hitters.train,
                                mtry = 19, importance = TRUE)
hitters.bagg.predict <- predict(hitters.bagging, hitters.test)
hitters.bagg.test.mse <- mean((hitters.bagg.predict - hitters.test$Salary)^2)
hitters.bagg.test.mse
## [1] 0.2301184
Para el conjunto de prueba MSE es 0.23
attach(Caravan)
## The following object is masked from OJ:
##
## Purchase
set.seed(1)
# First 1000 observations form the training set.
caravan.subset <- 1:1000
# gbm with distribution = "bernoulli" needs a 0/1 numeric response.
Caravan$Purchase <- ifelse(Caravan$Purchase == "Yes", 1, 0)
caravan.train <- Caravan[caravan.subset, ]
caravan.test <- Caravan[-caravan.subset, ]
set.seed(1)
caravan.boost <- gbm(Purchase ~ ., caravan.train, shrinkage = 0.01,
                     n.trees = 1000, distribution = "bernoulli")
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution =
## distribution, : variable 50: PVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution =
## distribution, : variable 71: AVRAAUT has no variation.
datatable(summary(caravan.boost), class="table-condensed", style="bootstrap", options = list(dom = 'tp'))
Predicción usando Boosting.
caravan.predict.boost<-predict(caravan.boost,caravan.test,n.trees = 1000,type="response")
caravan.predict.prob.boost<-ifelse(caravan.predict.boost>0.2,1,0)
table(caravan.test$Purchase,caravan.predict.prob.boost,dnn=c("Actual","Predicted"))
## Predicted
## Actual 0 1
## 0 4410 123
## 1 256 33
# Fraction of people predicted (at the 0.2 threshold) to buy who
# actually bought — i.e. precision / PPV of the boosting model.
# BUG FIX: computed from the confusion matrix instead of hard-coded
# counts, so it stays correct if upstream seeds or data change.
# NOTE(review): the variable is named TPF but this quantity is
# precision, not the true-positive rate.
boost.cm <- table(caravan.test$Purchase, caravan.predict.prob.boost)
TPF <- boost.cm["1", "1"] / sum(boost.cm[, "1"])
TPF
## [1] 0.2115385
Regresión logística.
caravan.logit<-glm(Purchase~.,caravan.train,family = "binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
carvan.predict.logit<-predict(caravan.logit,caravan.test,type="response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading
caravan.predict.prob.logit<-ifelse(carvan.predict.logit>0.2,1,0)
table(caravan.test$Purchase,caravan.predict.prob.logit,dnn=c("Actual","Predicted"))
## Predicted
## Actual 0 1
## 0 4183 350
## 1 231 58
# Precision (PPV) of the logistic model at the 0.2 threshold.
# BUG FIX: computed from the confusion matrix instead of hard-coded
# counts, so it stays correct if upstream seeds or data change.
logit.cm <- table(caravan.test$Purchase, caravan.predict.prob.logit)
TPF.logit <- logit.cm["1", "1"] / sum(logit.cm[, "1"])
TPF.logit
## [1] 0.1421569
Con el umbral de 0.2, el 21% de las personas que el modelo de Boosting predijo como compradoras realizaron efectivamente una compra.
Con el mismo umbral, el 14% de las personas que el modelo de regresión logística predijo como compradoras realizaron una compra.
Por lo tanto, para identificar clientes es mejor usar el modelo de Boosting que el modelo de regresión logística.
Cargamos el Data Set
# Load the German credit data from the UCI repository and assign
# readable column names.
german_credit <- read.table("http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data")
names(german_credit) <- c(
  "chk_acct", "duration", "credit_his", "purpose", "amount",
  "saving_acct", "present_emp", "installment_rate", "sex", "other_debtor",
  "present_resid", "property", "age", "other_install", "housing",
  "n_credits", "job", "n_people", "telephone", "foreign", "response"
)
kable(head(german_credit, 10))
| chk_acct | duration | credit_his | purpose | amount | saving_acct | present_emp | installment_rate | sex | other_debtor | present_resid | property | age | other_install | housing | n_credits | job | n_people | telephone | foreign | response |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| A11 | 6 | A34 | A43 | 1169 | A65 | A75 | 4 | A93 | A101 | 4 | A121 | 67 | A143 | A152 | 2 | A173 | 1 | A192 | A201 | 1 |
| A12 | 48 | A32 | A43 | 5951 | A61 | A73 | 2 | A92 | A101 | 2 | A121 | 22 | A143 | A152 | 1 | A173 | 1 | A191 | A201 | 2 |
| A14 | 12 | A34 | A46 | 2096 | A61 | A74 | 2 | A93 | A101 | 3 | A121 | 49 | A143 | A152 | 1 | A172 | 2 | A191 | A201 | 1 |
| A11 | 42 | A32 | A42 | 7882 | A61 | A74 | 2 | A93 | A103 | 4 | A122 | 45 | A143 | A153 | 1 | A173 | 2 | A191 | A201 | 1 |
| A11 | 24 | A33 | A40 | 4870 | A61 | A73 | 3 | A93 | A101 | 4 | A124 | 53 | A143 | A153 | 2 | A173 | 2 | A191 | A201 | 2 |
| A14 | 36 | A32 | A46 | 9055 | A65 | A73 | 2 | A93 | A101 | 4 | A124 | 35 | A143 | A153 | 1 | A172 | 2 | A192 | A201 | 1 |
| A14 | 24 | A32 | A42 | 2835 | A63 | A75 | 3 | A93 | A101 | 4 | A122 | 53 | A143 | A152 | 1 | A173 | 1 | A191 | A201 | 1 |
| A12 | 36 | A32 | A41 | 6948 | A61 | A73 | 2 | A93 | A101 | 2 | A123 | 35 | A143 | A151 | 1 | A174 | 1 | A192 | A201 | 1 |
| A14 | 12 | A32 | A43 | 3059 | A64 | A74 | 2 | A91 | A101 | 4 | A121 | 61 | A143 | A152 | 1 | A172 | 1 | A191 | A201 | 1 |
| A12 | 30 | A34 | A40 | 5234 | A61 | A71 | 4 | A94 | A101 | 2 | A123 | 28 | A143 | A152 | 2 | A174 | 1 | A191 | A201 | 2 |
Vemos la dimensión.
dim(german_credit)
## [1] 1000 21
Vemos el tipo de variables.
str(german_credit)
## 'data.frame': 1000 obs. of 21 variables:
## $ chk_acct : Factor w/ 4 levels "A11","A12","A13",..: 1 2 4 1 1 4 4 2 4 2 ...
## $ duration : int 6 48 12 42 24 36 24 36 12 30 ...
## $ credit_his : Factor w/ 5 levels "A30","A31","A32",..: 5 3 5 3 4 3 3 3 3 5 ...
## $ purpose : Factor w/ 10 levels "A40","A41","A410",..: 5 5 8 4 1 8 4 2 5 1 ...
## $ amount : int 1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
## $ saving_acct : Factor w/ 5 levels "A61","A62","A63",..: 5 1 1 1 1 5 3 1 4 1 ...
## $ present_emp : Factor w/ 5 levels "A71","A72","A73",..: 5 3 4 4 3 3 5 3 4 1 ...
## $ installment_rate: int 4 2 2 2 3 2 3 2 2 4 ...
## $ sex : Factor w/ 4 levels "A91","A92","A93",..: 3 2 3 3 3 3 3 3 1 4 ...
## $ other_debtor : Factor w/ 3 levels "A101","A102",..: 1 1 1 3 1 1 1 1 1 1 ...
## $ present_resid : int 4 2 3 4 4 4 4 2 4 2 ...
## $ property : Factor w/ 4 levels "A121","A122",..: 1 1 1 2 4 4 2 3 1 3 ...
## $ age : int 67 22 49 45 53 35 53 35 61 28 ...
## $ other_install : Factor w/ 3 levels "A141","A142",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ housing : Factor w/ 3 levels "A151","A152",..: 2 2 2 3 3 3 2 1 2 2 ...
## $ n_credits : int 2 1 1 1 2 1 1 1 1 2 ...
## $ job : Factor w/ 4 levels "A171","A172",..: 3 3 2 3 3 2 3 4 2 4 ...
## $ n_people : int 1 1 2 2 2 2 1 1 1 1 ...
## $ telephone : Factor w/ 2 levels "A191","A192": 2 1 1 1 1 2 1 2 1 1 ...
## $ foreign : Factor w/ 2 levels "A201","A202": 1 1 1 1 1 1 1 1 1 1 ...
## $ response : int 1 2 1 1 2 1 1 1 1 2 ...
Se cambian los tipos de la variable de respuesta: se recodifica 2 y 1 como 1 y 0, donde 0 = buen cliente y 1 = mal cliente.
# Recode the response from 2/1 to 1/0 (1 = bad, 0 = good) and make it a
# factor for classification.
german_credit$response = german_credit$response - 1 # Run this once
german_credit$response = as.factor(german_credit$response)
# BUG FIX: the original assigned to a free-standing variable
# `present_resid`, leaving the data-frame column numeric (as the
# summary below confirms). Convert the column itself.
german_credit$present_resid <- as.factor(german_credit$present_resid)
Miramos el resumen.
summary(german_credit)
## chk_acct duration credit_his purpose amount
## A11:274 Min. : 4.0 A30: 40 A43 :280 Min. : 250
## A12:269 1st Qu.:12.0 A31: 49 A40 :234 1st Qu.: 1366
## A13: 63 Median :18.0 A32:530 A42 :181 Median : 2320
## A14:394 Mean :20.9 A33: 88 A41 :103 Mean : 3271
## 3rd Qu.:24.0 A34:293 A49 : 97 3rd Qu.: 3972
## Max. :72.0 A46 : 50 Max. :18424
## (Other): 55
## saving_acct present_emp installment_rate sex other_debtor
## A61:603 A71: 62 Min. :1.000 A91: 50 A101:907
## A62:103 A72:172 1st Qu.:2.000 A92:310 A102: 41
## A63: 63 A73:339 Median :3.000 A93:548 A103: 52
## A64: 48 A74:174 Mean :2.973 A94: 92
## A65:183 A75:253 3rd Qu.:4.000
## Max. :4.000
##
## present_resid property age other_install housing
## Min. :1.000 A121:282 Min. :19.00 A141:139 A151:179
## 1st Qu.:2.000 A122:232 1st Qu.:27.00 A142: 47 A152:713
## Median :3.000 A123:332 Median :33.00 A143:814 A153:108
## Mean :2.845 A124:154 Mean :35.55
## 3rd Qu.:4.000 3rd Qu.:42.00
## Max. :4.000 Max. :75.00
##
## n_credits job n_people telephone foreign response
## Min. :1.000 A171: 22 Min. :1.000 A191:596 A201:963 0:700
## 1st Qu.:1.000 A172:200 1st Qu.:1.000 A192:404 A202: 37 1:300
## Median :1.000 A173:630 Median :1.000
## Mean :1.407 A174:148 Mean :1.155
## 3rd Qu.:2.000 3rd Qu.:1.000
## Max. :4.000 Max. :2.000
##
Vemos que no hay datos faltantes.
Se realiza una división de los datos en conjuntos de entrenamiento y de validación con una proporción de 75:25.
# Reproducible 75/25 train/validation split.
set.seed(10743959)
train.rows <- sample(nrow(german_credit), nrow(german_credit) * 0.75)
germancredit.train <- german_credit[train.rows, ]
germancredit.test <- german_credit[-train.rows, ]
Se realiza la regresión logística.
lr.gc.model<-glm(response ~ .,family = binomial,germancredit.train)
lr.gc.predict<-predict(lr.gc.model,germancredit.test,type="response")
Se selecciona con probabilidad de 0.5 los malos y buenos clientes.
lr.gc.predict<-ifelse(lr.gc.predict>0.5,1,0)
table(germancredit.test$response,lr.gc.predict,dnn = c("Actual ","Predicted "))
## Predicted
## Actual 0 1
## 0 145 31
## 1 36 38
lr.mr<-mean(ifelse(lr.gc.predict!=germancredit.test$response,1,0))
round(lr.mr,2)
## [1] 0.27
set.seed(10743959)
Construcción del modelo.
bag.gc.model<-randomForest(response ~ .,germancredit.train,ntree=1000,mtry=20,importance=TRUE)
Predicción de las pruebas.
bag.gc.predict<-predict(bag.gc.model,germancredit.test,type="class")
table(germancredit.test$response,bag.gc.predict,dnn = c("Actual ","Predicted "))
## Predicted
## Actual 0 1
## 0 157 19
## 1 37 37
Se realiza la prueba de tasa de clasificación errónea.
# Test misclassification rate for bagging.
# BUG FIX: the original hard-coded counts (17 + 39) that do not match
# the confusion matrix above (19 + 37); compute it from the predictions
# so it cannot drift out of sync with the table.
bagging.mr <- mean(bag.gc.predict != germancredit.test$response)
bagging.mr
## [1] 0.224
set.seed(10743959)
Realizamos la construcción del modelo.
rf.gc.model<-randomForest(response ~ .,germancredit.train,ntree=1000,mtry=sqrt(20),importance=TRUE)
Predicción de las pruebas.
rf.gc.predict<-predict(rf.gc.model,germancredit.test,type="class")
table(germancredit.test$response,rf.gc.predict,dnn = c("Actual ","Predicted "))
## Predicted
## Actual 0 1
## 0 160 16
## 1 44 30
Se realiza la prueba de tasa de clasificación errónea.
# Test misclassification rate for the random forest.
# BUG FIX: the original hard-coded counts (16 + 49) that do not match
# the confusion matrix above (16 + 44); compute it from the predictions
# so it cannot drift out of sync with the table.
rf.mr <- mean(rf.gc.predict != germancredit.test$response)
rf.mr
## [1] 0.26
Vemos que la tasa de clasificación errónea en el conjunto de prueba con las diferentes técnicas es la siguiente:
Regresión logística: 0.27 Bagging: 0.22 Random Forest: 0.26
Bagging es la técnica que da el error más bajo.
Se crean los datos necesarios.
theme_set(theme_tufte(base_size = 14))
set.seed(1)
# Two standard-normal predictors for 100 observations.
df <- data.frame(replicate(2, rnorm(n = 100)))
# FIX: as.tibble() is deprecated; use as_tibble() (same semantics here).
df <- as_tibble(df)
## Warning: `as.tibble()` is deprecated, use `as_tibble()` (but mind the new semantics).
## This warning is displayed once per session.
# TRUE for points (x, y) inside or on the circle x^2 + y^2 = 0.6
# (radius sqrt(0.6)); vectorized over x and y.
circle <- function(x, y) {
  radius.sq <- 0.6
  x^2 + y^2 <= radius.sq
}
# Label points inside the disc as Class A, plot them, then draw a
# 60/40 train/test split.
df <- df %>%
  rename(Var1 = X1, Var2 = X2) %>%
  mutate(
    Class = factor(ifelse(circle(Var1, Var2), 'Class A', 'Class B'))
  )
ggplot(df, aes(Var1, Var2, col = Class)) +
  geom_point(size = 2)
inTrain <- sample(nrow(df), nrow(df) * 0.6, replace = FALSE)
training <- df[inTrain, ]
testing <- df[-inTrain, ]
Clasificador
svm_basic <- svm(Class ~ ., data = training,
kernel = 'linear',
scale = FALSE, cost = 3)
plot(svm_basic, data = training)
mean(predict(svm_basic, testing) == testing$Class)
## [1] 0.675
mean(testing$Class == 'Class B')
## [1] 0.675
Ajuste de SVM con núcleo polinomial
svm_poly <- svm(Class ~ ., data = training,
kernel = 'polynomial', degree = 8,
scale = FALSE, cost = 4)
plot(svm_poly, data = df)
mean(predict(svm_poly, testing) == testing$Class)
## [1] 0.825
Montaje de SVM con núcleo radial
svm_radial <- svm(Class ~ ., data = training,
kernel = 'radial',
scale = FALSE, cost = 1)
plot(svm_radial, data = df)
mean(predict(svm_radial, testing) == testing$Class)
## [1] 0.95
Nuevo dataset.
theme_set(theme_tufte(base_size = 14))
set.seed(1)
# Two standard-normal predictors for 100 observations.
df <- data.frame(replicate(2, rnorm(n = 100)))
# FIX: as.tibble() is deprecated; use as_tibble() (same semantics here).
df <- as_tibble(df)
# Non-linear class boundary: TRUE where x^2 + y - x*y falls at or below
# the mean of |x| + |y| over the whole input vectors. Note the
# threshold depends on ALL points passed in, not just each pair.
class_func <- function(x, y) {
  threshold <- mean(abs(x) + abs(y))
  x^2 + y - x * y <= threshold
}
df <- df %>%
rename(Var1 = X1, Var2 = X2) %>%
mutate(Class = ifelse(class_func(Var1, Var2),
'Class A',
'Class B'),
Class = factor(Class))
ggplot(df, aes(Var1, Var2, col = Class)) +
geom_point(size = 2)
inTrain <- sample(nrow(df), nrow(df)*0.6, replace = FALSE)
training <- df[inTrain,]
testing <- df[-inTrain,]
Clasificador de vectores de soporte de montaje.
svm_basic <- svm(Class ~ ., data = training,
kernel = 'linear',
scale = FALSE, cost = 3)
plot(svm_basic, data = training)
mean(predict(svm_basic, testing) == testing$Class)
## [1] 0.75
mean(testing$Class == 'Class B')
## [1] 0.25
Ajuste de SVM con núcleo polinomial
svm_poly <- svm(Class ~ ., data = training,
kernel = 'polynomial', degree = 8,
scale = FALSE, cost = 4)
plot(svm_poly, data = df)
mean(predict(svm_poly, testing) == testing$Class)
## [1] 0.9
Montaje de SVM con núcleo radial
svm_radial <- svm(Class ~ ., data = training,
kernel = 'radial',
scale = FALSE, cost = 1)
plot(svm_radial, data = df)
mean(predict(svm_radial, testing) == testing$Class)
## [1] 0.825
We have seen that we can fit an SVM with a non-linear kernel in order to perform classification using a non-linear decision boundary. We will now see that we can also obtain a non-linear decision boundary by performing logistic regression using non-linear transformations of the features.
set.seed(1)
x1 <- runif(500) - 0.5
x2 <- runif(500) - 0.5
y <- as.integer(x1 ^ 2 - x2 ^ 2 > 0)
plot(x1[y == 0], x2[y == 0], col = "red", xlab = "X1", ylab = "X2")
points(x1[y == 1], x2[y == 1], col = "blue")
dat <- data.frame(x1 = x1, x2 = x2, y = as.factor(y))
lr.fit <- glm(y ~ ., data = dat, family = 'binomial')
lr.prob <- predict(lr.fit, newdata = dat, type = 'response')
lr.pred <- ifelse(lr.prob > 0.5, 1, 0)
plot(dat$x1, dat$x2, col = lr.pred + 2)
lr.nl <- glm(y ~ poly(x1, 2) + poly(x2, 2), data = dat, family = 'binomial')
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(lr.nl)
##
## Call:
## glm(formula = y ~ poly(x1, 2) + poly(x2, 2), family = "binomial",
## data = dat)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.079e-03 -2.000e-08 -2.000e-08 2.000e-08 1.297e-03
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -94.48 2963.78 -0.032 0.975
## poly(x1, 2)1 3442.52 104411.28 0.033 0.974
## poly(x1, 2)2 30110.74 858421.66 0.035 0.972
## poly(x2, 2)1 162.82 26961.99 0.006 0.995
## poly(x2, 2)2 -31383.76 895267.48 -0.035 0.972
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6.9218e+02 on 499 degrees of freedom
## Residual deviance: 4.2881e-06 on 495 degrees of freedom
## AIC: 10
##
## Number of Fisher Scoring iterations: 25
lr.prob.nl <- predict(lr.nl, newdata = dat, type = 'response')
lr.pred.nl <- ifelse(lr.prob.nl > 0.5, 1, 0)
plot(dat$x1, dat$x2, col = lr.pred.nl + 2)
Las predicciones son mejores que las del modelo lineal.
svm.lin <- svm(y ~ ., data = dat, kernel = 'linear', cost = 0.01)
plot(svm.lin, dat)
svm.nl <- svm(y ~ ., data = dat, kernel = 'radial', gamma = 1)
plot(svm.nl, data = dat)
El efecto de la regresión logística lineal en el límite no lineal es muy pobre. El efecto del núcleo lineal SVM es pequeño cuando se usa un costo pequeño, y el efecto de la regresión logística no lineal y SVM en el límite no lineal es muy bueno.
set.seed(1)
obs = 1000
x1 <- runif(obs, min = -4, max = 4)
x2 <- runif(obs, min = -1, max = 16)
y <- ifelse(x2 > x1 ^ 2, 0, 1)
dat <- data.frame(x1 = x1, x2 = x2, y = as.factor(y))
train <- sample(obs, obs/2)
dat.train <- dat[train, ]
dat.test <- dat[-train, ]
par(mfrow = c(1,2))
plot(dat.train$x1, dat.train$x2, col = as.integer(dat.train$y) + 1, main = 'training set')
plot(dat.test$x1, dat.test$x2, col = as.integer(dat.test$y) + 1, main = 'test set')
set.seed(1)
cost.grid <- c(0.001, 0.01, 0.1, 1, 5, 10, 100, 10000)
tune.out <- tune(svm, y ~., data = dat.train, kernel = 'linear', ranges = list(cost = cost.grid))
summary(tune.out)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 5
##
## - best performance: 0.25
##
## - Detailed performance results:
## cost error dispersion
## 1 1e-03 0.392 0.04825856
## 2 1e-02 0.256 0.05947922
## 3 1e-01 0.252 0.05902918
## 4 1e+00 0.252 0.06051630
## 5 5e+00 0.250 0.06271629
## 6 1e+01 0.250 0.06271629
## 7 1e+02 0.250 0.06271629
## 8 1e+04 0.390 0.05270463
Errores de entrenamiento de los modelos con diferente valor de costo:
# Training error for each cost value.
# FIX: iterate by index (seq_along) instead of looping over values and
# recovering the index with match() — simpler and robust to duplicate
# grid values.
err.rate.train <- rep(NA, length(cost.grid))
for (i in seq_along(cost.grid)) {
  svm.fit <- svm(y ~ ., data = dat.train, kernel = 'linear', cost = cost.grid[i])
  plot(svm.fit, data = dat.train)
  res <- table(prediction = predict(svm.fit, newdata = dat.train), truth = dat.train$y)
  err.rate.train[i] <- (res[2, 1] + res[1, 2]) / sum(res)
}
err.rate.train
## [1] 0.392 0.246 0.248 0.248 0.248 0.248 0.248 0.392
paste('The cost', cost.grid[which.min(err.rate.train)], 'has the minimum training error:', min(err.rate.train))
## [1] "The cost 0.01 has the minimum training error: 0.246"
El resultado óptimo es inconsistente con el resultado de la validación cruzada. A medida que el costo aumenta, el error de entrenamiento debería disminuir, pero hay aumentos y caídas aquí. La razón no está clara.
# Test error for each cost value.
# FIX: iterate by index (seq_along) instead of looping over values and
# recovering the index with match() — simpler and robust to duplicate
# grid values.
err.rate.test <- rep(NA, length(cost.grid))
for (i in seq_along(cost.grid)) {
  svm.fit <- svm(y ~ ., data = dat.train, kernel = 'linear', cost = cost.grid[i])
  res <- table(prediction = predict(svm.fit, newdata = dat.test), truth = dat.test$y)
  err.rate.test[i] <- (res[2, 1] + res[1, 2]) / sum(res)
}
err.rate.test
## [1] 0.362 0.216 0.216 0.218 0.220 0.220 0.218 0.362
paste('The cost', cost.grid[which.min(err.rate.test)], 'has the minimum test error:', min(err.rate.test))
## [1] "The cost 0.01 has the minimum test error: 0.216"
El costo con menor error de prueba es 0.01, que no coincide exactamente con el mejor costo de la validación cruzada (5); en todo caso, las diferencias de error entre los costos intermedios son muy pequeñas.
En este caso, no importa cómo ajuste la tasa de error de costo es relativamente alta, por lo tanto, cuando se usan diferentes costos, la tasa de error no ha cambiado significativamente y siempre ha sido muy alta, lo que puede deberse a una selección incorrecta del núcleo.
theme_set(theme_tufte(base_size = 14))
set.seed(1)
data(Auto)
# FIX: as.tibble() is deprecated; use as_tibble() (same semantics here).
Auto <- as_tibble(Auto)
# Binary target: highmpg = 1 when mpg is above the median; cylinders
# treated as a factor.
Auto <- Auto %>%
  mutate(highmpg = as.integer(mpg > median(mpg))) %>%
  mutate(highmpg = factor(highmpg),
         cylinders = factor(cylinders))
# Spot-check the new label against mpg.
Auto %>%
  sample_n(5) %>%
  select(mpg, highmpg)
## # A tibble: 5 x 2
## mpg highmpg
## <dbl> <fct>
## 1 44.3 1
## 2 23 1
## 3 26 1
## 4 23.9 1
## 5 23.2 1
Auto <- Auto %>%
select(-mpg, -name)
dummy_trans <- dummyVars(highmpg ~ ., data = Auto)
Auto_dummy <- predict(dummy_trans, Auto)
## Warning in model.frame.default(Terms, newdata, na.action = na.action, xlev
## = object$lvls): variable 'highmpg' is not a factor
svm_linear <- train(x = Auto_dummy, y = Auto$highmpg,
method = 'svmLinear2',
trControl = trainControl(method = 'cv', number = 10, allowParallel = TRUE),
preProcess = c('center', 'scale'),
tuneGrid = expand.grid(cost = seq(1, 20, by = 1)))
svm_linear$finalModel
##
## Call:
## svm.default(x = as.matrix(x), y = y, kernel = "linear", cost = param$cost,
## probability = classProbs)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 2
##
## Number of Support Vectors: 75
svm_poly <- train(x = Auto_dummy, y = Auto$highmpg,
method = 'svmPoly',
trControl = trainControl(method = 'cv', number = 10, allowParallel = TRUE),
preProcess = c('center', 'scale'),
tuneGrid = expand.grid(degree = seq(1, 8, by = 1),
C = seq(1, 5, by = 1),
scale = TRUE))
svm_poly$finalModel
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 1
##
## Polynomial kernel function.
## Hyperparameters : degree = 2 scale = TRUE offset = 1
##
## Number of Support Vectors : 71
##
## Objective Function Value : -45.587
## Training error : 0.045918
svm_radial <- train(x = Auto_dummy, y = Auto$highmpg,
method = 'svmRadial',
trControl = trainControl(method = 'cv', number = 10, allowParallel = TRUE),
preProcess = c('center', 'scale'),
tuneGrid = expand.grid(C = seq(0.001, 3, length.out = 10),
sigma = seq(0.2, 2, length.out = 5)))
svm_radial$finalModel
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 1.00066666666667
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 1.55
##
## Number of Support Vectors : 230
##
## Objective Function Value : -73.7206
## Training error : 0.02551
plot(svm_linear)
plot(svm_poly)
plot(svm_radial)
postResample(predict(svm_linear), Auto$highmpg)
## Accuracy Kappa
## 0.9285714 0.8571429
postResample(predict(svm_poly), Auto$highmpg)
## Accuracy Kappa
## 0.9540816 0.9081633
postResample(predict(svm_radial), Auto$highmpg)
## Accuracy Kappa
## 0.9744898 0.9489796
# OJ data: 800-observation training set, remainder for testing.
# NOTE(review): no set.seed() before sample(), so this split is not
# reproducible — consider seeding, as the other sections do.
data('OJ')
inTrain <- sample(nrow(OJ), 800, replace = FALSE)
training <- OJ[inTrain,]
testing <- OJ[-inTrain,]
# Support vector classifier (linear kernel) with a small cost.
svm_linear <- svm(Purchase ~ ., data = training,
kernel = 'linear',
cost = 0.01)
summary(svm_linear)
##
## Call:
## svm(formula = Purchase ~ ., data = training, kernel = "linear",
## cost = 0.01)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.01
##
## Number of Support Vectors: 432
##
## ( 216 216 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
postResample(predict(svm_linear, training), training$Purchase)
## Accuracy Kappa
## 0.8300000 0.6330795
postResample(predict(svm_linear, testing), testing$Purchase)
## Accuracy Kappa
## 0.8148148 0.6093750
svm_linear_tune <- train(Purchase ~ ., data = training,
method = 'svmLinear2',
trControl = trainControl(method = 'cv', number = 10),
preProcess = c('center', 'scale'),
tuneGrid = expand.grid(cost = seq(0.01, 10, length.out = 20)))
svm_linear_tune
## Support Vector Machines with Linear Kernel
##
## 800 samples
## 17 predictor
## 2 classes: 'CH', 'MM'
##
## Pre-processing: centered (17), scaled (17)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 720, 720, 720, 721, 720, 719, ...
## Resampling results across tuning parameters:
##
## cost Accuracy Kappa
## 0.0100000 0.8237762 0.6214852
## 0.5357895 0.8287762 0.6338467
## 1.0615789 0.8287920 0.6339144
## 1.5873684 0.8287920 0.6339144
## 2.1131579 0.8275420 0.6314674
## 2.6389474 0.8300420 0.6368396
## 3.1647368 0.8312920 0.6396285
## 3.6905263 0.8300262 0.6370727
## 4.2163158 0.8300262 0.6370727
## 4.7421053 0.8275262 0.6317216
## 5.2678947 0.8287762 0.6341989
## 5.7936842 0.8287762 0.6341989
## 6.3194737 0.8312762 0.6391430
## 6.8452632 0.8275262 0.6313912
## 7.3710526 0.8287762 0.6338684
## 7.8968421 0.8275262 0.6314125
## 8.4226316 0.8275262 0.6314125
## 8.9484211 0.8287762 0.6338684
## 9.4742105 0.8275262 0.6314125
## 10.0000000 0.8287607 0.6341957
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cost = 3.164737.
postResample(predict(svm_linear_tune, training), training$Purchase)
## Accuracy Kappa
## 0.836250 0.649367
postResample(predict(svm_linear_tune, testing), testing$Purchase)
## Accuracy Kappa
## 0.8296296 0.6417445
set.seed(410)
svm.radial = svm(Purchase ~ ., data = OJ.train, kernel = "radial")
summary(svm.radial)
##
## Call:
## svm(formula = Purchase ~ ., data = OJ.train, kernel = "radial")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 375
##
## ( 185 190 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
train.pred = predict(svm.radial, OJ.train)
table(OJ.train$Purchase, train.pred)
## train.pred
## CH MM
## CH 446 43
## MM 75 236
(42 + 74)/(441 + 42 + 74 + 243)
## [1] 0.145
test.pred = predict(svm.radial, OJ.test)
table(OJ.test$Purchase, test.pred)
## test.pred
## CH MM
## CH 145 19
## MM 25 81
(27 + 22)/(148 + 22 + 27 + 73)
## [1] 0.1814815
set.seed(755)
tune.out = tune(svm, Purchase ~ ., data = OJ.train, kernel = "radial", ranges = list(cost = 10^seq(-2,
1, by = 0.25)))
summary(tune.out)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 1
##
## - best performance: 0.165
##
## - Detailed performance results:
## cost error dispersion
## 1 0.01000000 0.38875 0.05510407
## 2 0.01778279 0.38875 0.05510407
## 3 0.03162278 0.37875 0.06562996
## 4 0.05623413 0.20875 0.03387579
## 5 0.10000000 0.19125 0.02360703
## 6 0.17782794 0.18875 0.02791978
## 7 0.31622777 0.17375 0.02664713
## 8 0.56234133 0.16625 0.02889757
## 9 1.00000000 0.16500 0.02622022
## 10 1.77827941 0.16500 0.02342245
## 11 3.16227766 0.17000 0.02371708
## 12 5.62341325 0.17250 0.02687419
## 13 10.00000000 0.18125 0.02585349
svm.radial = svm(Purchase ~ ., data = OJ.train, kernel = "radial", cost = tune.out$best.parameters$cost)
train.pred = predict(svm.radial, OJ.train)
table(OJ.train$Purchase, train.pred)
## train.pred
## CH MM
## CH 446 43
## MM 75 236
(81 + 43)/(440 + 43 + 81 + 236)
## [1] 0.155
test.pred = predict(svm.radial, OJ.test)
table(OJ.test$Purchase, test.pred)
## test.pred
## CH MM
## CH 145 19
## MM 25 81
(28 + 25)/(145 + 25 + 28 + 72)
## [1] 0.1962963
set.seed(8112)
svm.poly = svm(Purchase ~ ., data = OJ.train, kernel = "poly", degree = 2)
summary(svm.poly)
##
## Call:
## svm(formula = Purchase ~ ., data = OJ.train, kernel = "poly",
## degree = 2)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: polynomial
## cost: 1
## degree: 2
## coef.0: 0
##
## Number of Support Vectors: 456
##
## ( 225 231 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
train.pred = predict(svm.poly, OJ.train)
table(OJ.train$Purchase, train.pred)
## train.pred
## CH MM
## CH 454 35
## MM 114 197
(33 + 111)/(450 + 33 + 111 + 206)
## [1] 0.18
test.pred = predict(svm.poly, OJ.test)
table(OJ.test$Purchase, test.pred)
## test.pred
## CH MM
## CH 153 11
## MM 40 66
(21 + 34)/(149 + 21 + 34 + 66)
## [1] 0.2037037
set.seed(322)
tune.out = tune(svm, Purchase ~ ., data = OJ.train, kernel = "poly", degree = 2,
ranges = list(cost = 10^seq(-2, 1, by = 0.25)))
summary(tune.out)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 10
##
## - best performance: 0.1725
##
## - Detailed performance results:
## cost error dispersion
## 1 0.01000000 0.38625 0.03087272
## 2 0.01778279 0.36500 0.03162278
## 3 0.03162278 0.35375 0.03064696
## 4 0.05623413 0.33250 0.04048319
## 5 0.10000000 0.31875 0.04050463
## 6 0.17782794 0.25250 0.03763863
## 7 0.31622777 0.21125 0.04267529
## 8 0.56234133 0.20250 0.04241004
## 9 1.00000000 0.19625 0.04126894
## 10 1.77827941 0.19000 0.04556741
## 11 3.16227766 0.18875 0.05185785
## 12 5.62341325 0.18375 0.05001736
## 13 10.00000000 0.17250 0.04518481
svm.poly = svm(Purchase ~ ., data = OJ.train, kernel = "poly", degree = 2, cost = tune.out$best.parameters$cost)
train.pred = predict(svm.poly, OJ.train)
table(OJ.train$Purchase, train.pred)
## train.pred
## CH MM
## CH 450 39
## MM 81 230
(36 + 85)/(447 + 36 + 85 + 232)
## [1] 0.15125
test.pred = predict(svm.poly, OJ.test)
table(OJ.test$Purchase, test.pred)
## test.pred
## CH MM
## CH 148 16
## MM 25 81
(22 + 28)/(148 + 22 + 28 + 72)
## [1] 0.1851852
En general, los modelos son muy similares, pero el núcleo radial funciona mejor con un pequeño margen.